In [ ]:
# COVID-19 Global Data Analysis

This project analyzes global COVID-19 case and death data from the WHO. It includes data cleaning, visualizations, country and regional comparisons, and geographic mapping using interactive choropleth charts.
Tools used: Python, Pandas, Seaborn, Matplotlib, Plotly.
Goals:
- Understand global trends and regional differences
- Identify hotspots and infection trajectories
- Normalize data using per capita metrics
In [13]:
import pandas as pd
covid_df= pd.read_csv("C:/Users/ADMIN/Desktop/pipi/covid/WHO-COVID-19-global-daily-data.csv")
#load the dataset
In [15]:
df.head(10).
### Initial Data Preview
Here we preview the first few rows of the dataset to understand its structure.
Out[15]:
Date_reported Country_code Country WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths
0 2020-01-04 VC Saint Vincent and the Grenadines AMR NaN 0 NaN 0
1 2020-01-04 SN Senegal AFR NaN 0 NaN 0
2 2020-01-04 SB Solomon Islands WPR 0.0 0 0.0 0
3 2020-01-04 LK Sri Lanka SEAR 0.0 0 0.0 0
4 2020-01-04 SY Syrian Arab Republic EMR NaN 0 NaN 0
5 2020-01-04 TJ Tajikistan EUR NaN 0 NaN 0
6 2020-01-04 TH Thailand SEAR 0.0 0 0.0 0
7 2020-01-04 AE United Arab Emirates EMR NaN 0 NaN 0
8 2020-01-04 TZ United Republic of Tanzania AFR NaN 0 NaN 0
9 2020-01-04 VE Venezuela (Bolivarian Republic of) AMR NaN 0 NaN 0
In [17]:
df.info
Out[17]:
<bound method DataFrame.info of        Date_reported Country_code                           Country  \
0         2020-01-04           VC  Saint Vincent and the Grenadines   
1         2020-01-04           SN                           Senegal   
2         2020-01-04           SB                   Solomon Islands   
3         2020-01-04           LK                         Sri Lanka   
4         2020-01-04           SY              Syrian Arab Republic   
...              ...          ...                               ...   
472555    2025-05-25           MX                            Mexico   
472556    2025-05-25           MR                        Mauritania   
472557    2025-05-25           ML                              Mali   
472558    2025-05-25           ME                        Montenegro   
472559    2025-05-25           NC                     New Caledonia   

       WHO_region  New_cases  Cumulative_cases  New_deaths  Cumulative_deaths  
0             AMR        NaN                 0         NaN                  0  
1             AFR        NaN                 0         NaN                  0  
2             WPR        0.0                 0         0.0                  0  
3            SEAR        0.0                 0         0.0                  0  
4             EMR        NaN                 0         NaN                  0  
...           ...        ...               ...         ...                ...  
472555        AMR        NaN           7622513         NaN             334818  
472556        AFR        NaN             63889         NaN                997  
472557        AFR        NaN             33193         NaN                743  
472558        EUR        NaN            251280         NaN               2654  
472559        WPR        NaN             80203         NaN                314  

[472560 rows x 8 columns]>
In [9]:
df.describe
Out[9]:
<bound method NDFrame.describe of        Date_reported Country_code                           Country  \
0         2020-01-04           VC  Saint Vincent and the Grenadines   
1         2020-01-04           SN                           Senegal   
2         2020-01-04           SB                   Solomon Islands   
3         2020-01-04           LK                         Sri Lanka   
4         2020-01-04           SY              Syrian Arab Republic   
...              ...          ...                               ...   
472555    2025-05-25           MX                            Mexico   
472556    2025-05-25           MR                        Mauritania   
472557    2025-05-25           ML                              Mali   
472558    2025-05-25           ME                        Montenegro   
472559    2025-05-25           NC                     New Caledonia   

       WHO_region  New_cases  Cumulative_cases  New_deaths  Cumulative_deaths  
0             AMR        NaN                 0         NaN                  0  
1             AFR        NaN                 0         NaN                  0  
2             WPR        0.0                 0         0.0                  0  
3            SEAR        0.0                 0         0.0                  0  
4             EMR        NaN                 0         NaN                  0  
...           ...        ...               ...         ...                ...  
472555        AMR        NaN           7622513         NaN             334818  
472556        AFR        NaN             63889         NaN                997  
472557        AFR        NaN             33193         NaN                743  
472558        EUR        NaN            251280         NaN               2654  
472559        WPR        NaN             80203         NaN                314  

[472560 rows x 8 columns]>
In [11]:
df.isnull().sum()
Out[11]:
Date_reported             0
Country_code           1969
Country                   0
WHO_region                0
New_cases            263304
Cumulative_cases          0
New_deaths           319112
Cumulative_deaths         0
dtype: int64
In [21]:
## Data Cleaning

#In this step, we clean and prepare the COVID-19 dataset for analysis. This includes:

#Converting date columns
 #Handling missing values
 #Replacing negative values in case and death counts
In [23]:
df['Date_reported'] = pd.to_datetime(df['Date_reported'])  #converting date columns
In [25]:
df.isnull().sum()  #we check for missing data in all columns to determine if any rows need to be filled
Out[25]:
Date_reported             0
Country_code           1969
Country                   0
WHO_region                0
New_cases            263304
Cumulative_cases          0
New_deaths           319112
Cumulative_deaths         0
dtype: int64
In [29]:
df[df['Country_code'].isna()].head(10) #checking the country codes to identify the missing values
Out[29]:
Date_reported Country_code Country WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths
207 2020-01-04 NaN Namibia AFR NaN 0 NaN 0
279 2020-01-05 NaN Namibia AFR NaN 0 NaN 0
688 2020-01-06 NaN Namibia AFR NaN 0 NaN 0
759 2020-01-07 NaN Namibia AFR NaN 0 NaN 0
1168 2020-01-08 NaN Namibia AFR NaN 0 NaN 0
1239 2020-01-09 NaN Namibia AFR NaN 0 NaN 0
1648 2020-01-10 NaN Namibia AFR NaN 0 NaN 0
1717 2020-01-11 NaN Namibia AFR NaN 0 NaN 0
2128 2020-01-12 NaN Namibia AFR NaN 0 NaN 0
2198 2020-01-13 NaN Namibia AFR NaN 0 NaN 0
In [37]:
import pycountry
def get_country_code(name):
    try:
        return pycountry.countries.lookup(name).alpha_2
    except LookupError:
        return None
In [40]:
mask = df['Country_code'].isna()
df.loc[mask, 'Country_code'] = df.loc[mask,'Country'].apply(get_country_code)
###  Automatically fill missing `Country_code` values using `Country` names
In [44]:
df['Country_code'].isna().sum() #confirms they are no missing values.
Out[44]:
0
In [54]:
missing_percentage = df[['New_cases', 'New_deaths']].isna().mean() * 100
print(missing_percentage) #more than half of the new cases and new deaths were missing
#used linear interpolation by country to fill in missing values
New_cases     55.718639
New_deaths    67.528356
dtype: float64
In [58]:
df['New_cases'] = df.groupby('Country')['New_cases'].transform(lambda x: x.interpolate(method='linear'))
df['New_deaths'] = df.groupby('Country')['New_deaths'].transform(lambda x: x.interpolate(method='linear'))
#used linear interpolation by country to fill in missing value
In [60]:
print(df[['New_cases', 'New_deaths']].isna().sum())
New_cases     17188
New_deaths    40146
dtype: int64
In [62]:
df['New_cases'] = df['New_cases'].fillna(0)
df['New_deaths'] = df['New_deaths'].fillna(0) #filled with 0 to make the data complete
In [78]:
df.isnull().sum() #confirmation
Out[78]:
Date_reported        0
Country_code         0
Country              0
WHO_region           0
New_cases            0
Cumulative_cases     0
New_deaths           0
Cumulative_deaths    0
dtype: int64
In [90]:
numeric_df = df.select_dtypes(include='number')
#selecting only numeric columns
In [92]:
negative_counts = (numeric_df < 0).sum()
print(negative_counts)
# checking for negative values
New_cases            743
Cumulative_cases       0
New_deaths             0
Cumulative_deaths      0
dtype: int64
In [94]:
df['New_cases'] = df['New_cases'].apply(lambda x: max(x, 0))
#remove the negative values
In [96]:
print((df['New_cases'] < 0).sum())  # Should return 0
#confirmation
0
In [98]:
df.describe()
Out[98]:
Date_reported New_cases Cumulative_cases New_deaths Cumulative_deaths
count 472560 4.725600e+05 4.725600e+05 472560.000000 4.725600e+05
mean 2022-09-14 00:00:00.000000256 3.742682e+03 2.001990e+06 29.733821 2.139459e+04
min 2020-01-04 00:00:00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00
25% 2021-05-10 00:00:00 2.000000e+00 6.265000e+03 0.000000 3.800000e+01
50% 2022-09-14 00:00:00 2.800000e+01 5.714900e+04 1.000000 6.810000e+02
75% 2024-01-19 00:00:00 3.300000e+02 6.432410e+05 4.000000 7.693000e+03
max 2025-05-25 00:00:00 6.966046e+06 1.034368e+08 44047.000000 1.224213e+06
std NaN 4.448369e+04 8.405211e+06 201.013603 8.597986e+04
In [109]:
high_case_threshold = 1000000
high_case_df=df[df['New_cases'] > high_case_threshold] 
In [111]:
high_case_df.sort_values('New_cases', ascending=False).head(10)
#reviewing days with extremely high new case counts
Out[111]:
Date_reported Country_code Country WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths
260351 2022-12-23 CN China WPR 6966046.0 50447985 894.0 36318
259975 2022-12-22 CN China WPR 6434648.0 43481939 836.0 35424
260456 2022-12-24 CN China WPR 6327801.0 56775786 1308.0 37626
259872 2022-12-21 CN China WPR 5905312.0 37047291 628.0 34588
260831 2022-12-25 CN China WPR 5669864.0 62445650 1369.0 38995
259495 2022-12-20 CN China WPR 5102957.0 31141979 454.0 33960
260936 2022-12-26 CN China WPR 4768272.0 67213922 1394.0 40389
261311 2022-12-27 CN China WPR 4462481.0 71676403 1416.0 41805
261416 2022-12-28 CN China WPR 4356772.0 76033175 1845.0 43650
259392 2022-12-19 CN China WPR 4068849.0 26039022 360.0 33506
In [123]:
df['high_case_flag'] = df['New_cases'] > 100000 #flagging unusually high case counts
In [125]:
df[df['high_case_flag'] == True]
Out[125]:
Date_reported Country_code Country WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths high_case_flag high_death_flag
67033 2020-10-09 FR France EUR 104347.571429 463008 653.142857 22134 True True
67366 2020-10-10 FR France EUR 110253.285714 463008 671.571429 22134 True True
67512 2020-10-11 FR France EUR 116159.000000 579167 690.000000 22824 True True
67847 2020-10-12 FR France EUR 123541.285714 579167 729.000000 22824 True True
67992 2020-10-13 FR France EUR 130923.571429 579167 768.000000 22824 True True
... ... ... ... ... ... ... ... ... ... ...
343252 2023-12-04 RU Russian Federation EUR 107174.000000 23511820 148.000000 400803 True False
343621 2023-12-05 RU Russian Federation EUR 105706.714286 23511820 150.285714 400803 True False
343732 2023-12-06 RU Russian Federation EUR 104239.428571 23511820 152.571429 400803 True False
344101 2023-12-07 RU Russian Federation EUR 102772.142857 23511820 154.857143 400803 True False
344214 2023-12-08 RU Russian Federation EUR 101304.857143 23511820 157.142857 400803 True False

3060 rows × 10 columns

In [121]:
df['high_death_flag'] = df['New_deaths'] > 200 
# Flagging Unusual Daily Death Counts
#Using a threshold of `New_deaths > 200` we flag rows with unusually high daily deaths. These are likely outliers due to data dumps or reporting issues.
In [119]:
df[df['high_death_flag'] == True]
Out[119]:
Date_reported Country_code Country WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths high_case_flag high_death_flag
17634 2020-03-17 ES Spain EUR 18672.428571 13174 1021.285714 482 False True
17915 2020-03-18 ES Spain EUR 22207.142857 13174 1303.428571 482 False True
18114 2020-03-19 ES Spain EUR 25741.857143 13174 1585.571429 482 False True
18393 2020-03-20 ES Spain EUR 29276.571429 13174 1867.714286 482 False True
18595 2020-03-21 ES Spain EUR 32811.285714 13174 2149.857143 482 False True
... ... ... ... ... ... ... ... ... ... ...
439734 2025-01-09 US United States of America AMR 93260.000000 103436829 1034.000000 1213913 False True
440123 2025-01-10 US United States of America AMR 93260.000000 103436829 1027.000000 1213913 False True
440215 2025-01-11 US United States of America AMR 93260.000000 103436829 1020.000000 1213913 False True
440603 2025-01-12 US United States of America AMR 93260.000000 103436829 1013.000000 1213913 False True
440694 2025-01-13 US United States of America AMR 93260.000000 103436829 1006.000000 1213913 False True

2883 rows × 10 columns

In [127]:
df.drop_duplicates(inplace=True) #dropping duplicates
In [129]:
df.duplicated().sum()
Out[129]:
0
In [131]:
daily_cases = df.groupby('Date_reported')['New_cases'].sum().reset_index()
plt.figure(figsize=(14,6))
plt.plot(daily_cases['Date_reported'], daily_cases['New_cases'], color='blue', linewidth=1.5)
plt.title('Global Daily New COVID-19 Cases')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.grid(True)
plt.tight_layout()
plt.show()
#Global Daily New Cases
No description has been provided for this image
In [139]:
daily_deaths= df.groupby('Date_reported')['New_deaths'].sum().reset_index()
plt.figure(figsize=(14,6))
plt.plot(daily_deaths['Date_reported'], daily_cases['New_cases'], color='red',linewidth=1.5)
plt.title('Global Daily COVID-19 deaths')
plt.xlabel('Date')
plt.ylabel('New Deaths')
plt.grid(True)
plt.tight_layout()
plt.show()
#global daily deaths
No description has been provided for this image
In [157]:
fig, ax1 = plt.subplots(figsize=(14, 6))
# Primary y-axis for cases
ax1.plot(daily_summary['Date_reported'], daily_summary['Cumulative_cases'], color='blue', label='Cumulative Cases')
ax1.set_xlabel('Date')
ax1.set_ylabel('Cumulative Cases', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
# Secondary y-axis for deaths
ax2 = ax1.twinx()
ax2.plot(daily_summary['Date_reported'], daily_summary['Cumulative_deaths'], color='red', label='Cumulative Deaths')
ax2.set_ylabel('Cumulative Deaths', color='red')
ax2.tick_params(axis='y', labelcolor='red')
plt.title('Global Cumulative COVID-19 Cases vs Deaths Over Time')
fig.tight_layout()
plt.grid(True)
plt.show()
# Cumulative Cases vs Deaths
                                         
                                             
No description has been provided for this image
In [167]:
latest_date = df['Date_reported'].max()
top10 = df[df['Date_reported'] == latest_date]\
         .groupby('Country')['Cumulative_cases'].sum()\
         .sort_values(ascending = False)\
         .head(10)
top10.plot(kind='bar', color='orange', figsize=(15,10))
plt.title('Top 10 Countries by Cumulative Cases (Latest Date)')
plt.ylabel('Cumulative Cases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
#Top 10 Countries by Cumulative Cases
No description has been provided for this image
In [181]:
from scipy.signal import find_peaks
In [185]:
daily= df.groupby('Date_reported')[['New_cases', 'New_deaths']].sum().reset_index()
# Detect peaks in new cases and new deaths
peaks_cases, _ = find_peaks(daily['New_cases'], distance=20, prominence=1e4)
peaks_deaths, _ = find_peaks(daily['New_deaths'], distance=20, prominence=500)
 
In [187]:
plt.figure(figsize=(14, 6))
plt.plot(daily['Date_reported'], daily['New_cases'], label='New Cases', color='blue')
plt.plot(daily['Date_reported'].iloc[peaks_cases], daily['New_cases'].iloc[peaks_cases], "x", label='Peaks', color='black')
plt.title("Global Daily New COVID-19 Cases with Peaks")
plt.xlabel("Date")
plt.ylabel("New Cases")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [189]:
plt.figure(figsize=(14, 6))
plt.plot(daily['Date_reported'], daily['New_deaths'], label='New Deaths', color='red')
plt.plot(daily['Date_reported'].iloc[peaks_deaths], daily['New_deaths'].iloc[peaks_deaths], "x", label='Peaks', color='black')
plt.title("Global Daily New COVID-19 Deaths with Peaks")
plt.xlabel("Date")
plt.ylabel("New Deaths")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [191]:
# Drop accidental duplicate column if it exists
if 'New_Cases' in df.columns:
    df.drop(columns='New_Cases', inplace=True)
In [197]:
# Ensure date is datetime
df['Date_reported'] = pd.to_datetime(df['Date_reported'])

# Get top 10 countries by total cumulative cases
top_10_countries = df.groupby('Country')['Cumulative_cases'].max().sort_values(ascending=False).head(10).index

# Filter data for just those countries
top_10_df = df[df['Country'].isin(top_10_countries)]

# Plot
plt.figure(figsize=(14, 6))
for country in top_10_countries:
    country_data = top_10_df[top_10_df['Country'] == country]
    plt.plot(country_data['Date_reported'], country_data['Cumulative_cases'], label=country)

plt.title('Cumulative COVID-19 Cases Over Time - Top 10 Countries')
plt.xlabel('Date')
plt.ylabel('Cumulative Cases')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [201]:
region_peaks = df.groupby('WHO_region')['New_cases'].max().sort_values(ascending=False)
plt.figure(figsize=(10,6))
sns.barplot(x=region_peaks.values, y=region_peaks.index, palette='magma')
plt.title('Peak Daily New Cases by WHO Region')
plt.xlabel('Peak New Cases')
plt.ylabel('WHO Region')
plt.tight_layout()
plt.show().
#Group by WHO region and get peak new cases per region
C:\Users\ADMIN\AppData\Local\Temp\ipykernel_2016\310614675.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=region_peaks.values, y=region_peaks.index, palette='magma')
No description has been provided for this image
In [205]:
region_daily = df.groupby(['Date_reported', 'WHO_region'])['New_cases'].sum().reset_index()

plt.figure(figsize=(14,6))
for region in df['WHO_region'].unique():
    region_data = region_daily[region_daily['WHO_region'] == region]
    plt.plot(region_data['Date_reported'], region_data['New_cases'], label=region)

plt.title('Daily New COVID-19 Cases by WHO Region')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend()
plt.tight_layout()
plt.show()
#compares total dailly new cases across WHO regions which were most affected overtime
No description has been provided for this image
In [209]:
import plotly.express as px

# Get total cases per country
total_cases_by_country = df.groupby('Country')['Cumulative_cases'].max().reset_index()

# Rename for clarity
total_cases_by_country.columns = ['Country', 'Total_Cases']
In [211]:
fig = px.choropleth(
    total_cases_by_country,
    locations='Country',
    locationmode='country names',
    color='Total_Cases',
    hover_name='Country',
    color_continuous_scale='Reds',
    title='Total COVID-19 Cases by Country'
)

fig.update_layout(geo=dict(showframe=False, showcoastlines=False))
fig.show()
### World Map: Total COVID-19 Cases by Country
#This interactive choropleth map visualizes the total cumulative COVID-19 cases by country. Darker red areas represent countries with higher case counts.
In [213]:
# Get total deaths per country
total_deaths_by_country = df.groupby('Country')['Cumulative_deaths'].max().reset_index()
# Rename for clarity
total_deaths_by_country.columns = ['Country', 'Total_Deaths']
In [215]:
fig = px.choropleth(
    total_deaths_by_country,
    locations='Country',
    locationmode='country names',
    color='Total_Deaths',
    hover_name='Country',
    color_continuous_scale='Blues',
    title='Total COVID-19 Deaths by Country'
)

fig.update_layout(geo=dict(showframe=False, showcoastlines=False))
fig.show()
#### World Map: Total COVID-19 Deaths by Country

#This interactive choropleth map shows the total number of COVID-19 deaths reported by each country. Darker blue shades indicate higher death tolls.
In [223]:
import os
print(os.getcwd())
C:\Users\ADMIN
In [ ]: